In [ ]:
library = c("dplyr","GGally","readxl","ggplot2","ggmap","data.table",
"R.utils","DT","maps","rJava","corrplot","caret","IRdisplay")
for(i in 1:length(library)){library(library[i], character.only = T)}
In [ ]:
House = readxl::read_excel("fs/house/kc_house.xlsx",
sheet = "data",
col_names = TRUE)
House$date<-(substr(House$date, 1, 8))
House$date = as.numeric(House$date)
House$per.price = House$price / House$sqft_living
In [5]:
# summary(House)
# View(House)
# str(House)
colSums(is.na(House))
# 위 3단계 과정은 한 번쯤 다 보셨을 테니 생략
# colSums()를 통해 Missing Value 확인
In [4]:
# train 70% / test 30%
# trian 을 통해 모형을 만들고, test를 통해 해당 모형 검증
set.seed(14)
ratio = sample(1:nrow(House), size = 0.30*nrow(House))
test = House[-ratio,] #Test dataset 30% of total
train = House[ratio,] #Train dataset 70% of total
In [31]:
# 종속변수(price)에 영향을 미치는 변수를 찾기 위해 corrplot을 활용하여 분석한 결과, 아래와 같은 결과를 얻었음.
train.num <- cor(train) ## House_num 생성 후 상관관계 분석치 할당
train.num = round(train.num, 3) ## 소수점 셋째자리까지 출력
corrplot::corrplot(train.num,
method = "number",
shade.col = NA,
tl.col = "black",
tl.srt = 40,
order = "hclust",
diag = FALSE,
number.cex = 0.6)
In [16]:
analysis = train %>%
select(price, zipcode, sqft_living, grade, bathrooms, bedrooms, lat, long)
In [6]:
KingCounty.all = ggmap::get_map(location = c(lon=-122.3, lat=47.4),
zoom = 9)
KingCounty.all.1 = ggmap(KingCounty.all) + stat_density_2d(data = train, aes(x = long, y = lat, fill=..level.., alpha=..level..), geom = "polygon", size = 1, bins= 20)
KingCounty.all.1
In [8]:
## 위도 47.5를 기준으로 남 / 북으로 데이터를 구분
train$north = ifelse(train$lat >= 47.5 ,1 ,0)
north = train %>%
filter(north == 1)
south = train %>%
filter(north == 0)
KingCounty.all.2 = ggmap(KingCounty.all) + geom_point(data = north, aes(x = long, y = lat),
color = "blue", size = 1) + geom_point(data = south, aes(x = long, y = lat), color = "red", size = 1)
KingCounty.all.2
In [9]:
train %>%
select(price, north) %>%
group_by(north) %>%
summarise(max.price = max(price),
min.price = min(price),
mean.price = mean(price),
median.price = median(price),
n = n())
In [23]:
analysis$per.price = analysis$price / analysis$sqft_living # per.price 변수 생성
# zipcode 별로 grouping하여 최대, 최소, 평균, 중위수, 데이터 개수 계산
zipcode.price = analysis %>%
select(zipcode, per.price) %>%
group_by(zipcode) %>%
summarise(max.price = max(per.price),
min.price = min(per.price),
mean.price = mean(per.price),
median.price = median(per.price),
n = n())
nrow(zipcode.price) # zipcode는 총 70개
head(zipcode.price) # 각 zipcode별로 34 ~ 430의 데이터들이 존재
In [27]:
# zipcode별 평균 - 중위수 차이 계산
zipcode.price$dif.price = round(zipcode.price$mean.price - zipcode.price$median.price, digits = 2)
zipcode.price = zipcode.price[order(zipcode.price$dif.price),]
head(zipcode.price, n = 5)
tail(zipcode.price, n = 5)
In [29]:
## 100단위로 dummy변수 대입
zipcode.price$median.price = round(zipcode.price$median.price)
zipcode.price$dummy.median = as.numeric(substr(zipcode.price$median.price,1,1))*100
head(zipcode.price)
In [30]:
## zipcode.price에서 zipcode, dummy.median를 추출하여 mg데이터 생성
mg = zipcode.price %>%
select(zipcode, dummy.median)
## analysis와 mg를 zipcode(PK)기준으로 left outer join
analysis1 = merge(analysis, mg, by = "zipcode", all.x=TRUE)
analysis1$dummy.median = as.factor(analysis1$dummy.median)
## 구글맵에 시각화
KingCounty = ggmap::get_map(location = c(lon=-122.3, lat=47.4),
zoom = 9)
KingCounty1 = ggmap(KingCounty) + geom_point(data = analysis1, aes(x = long, y = lat, colour = dummy.median))
KingCounty1
In [9]:
IRdisplay::display_png(file="image/img01.PNG")
In [40]:
## median.price 가 200~300 사이인 데이터만 추출
zipcode.price1 = zipcode.price %>%
filter(median.price >= 200 & median.price <300) %>%
select(zipcode, median.price)
## 나머지를 구해서 10의자리 계산
zipcode.price1$dummy.median = as.numeric(substr(zipcode.price1$median.price,1,2))%%10
head(zipcode.price1)
In [42]:
## analysis와 zipcode.price1을 zipcode(PK)기준으로 left outer join
analysis2 = merge(analysis, zipcode.price1, by = "zipcode", all.x = TRUE)
analysis2$dummy.median = as.factor(analysis2$dummy.median) # factor로 변환
## 구글맵에 시각화
KingCounty2 = ggmap(KingCounty) + geom_point(data = analysis2, aes(x = long, y = lat, colour = dummy.median))
KingCounty2
In [44]:
## per.price 가 200대인 집들을 시각화한 결과를 바탕으로
## 200 초반대 까지는 100에
## 200 중반은 200으로
## 200 후반 값을 가지는 8,9는 300에 같이 그룹핑
zipcode.price2 = zipcode.price
zipcode.price2 = within(zipcode.price2,{
dummy.median = numeric(0)
dummy.median [ median.price < 220] = 1
dummy.median [ median.price >= 221 & median.price <= 279 ] = 2
dummy.median [ median.price >= 280 & median.price <= 399 ] = 3
dummy.median [ median.price >= 400 & median.price <= 499 ] = 4
dummy.median [ median.price >= 500 & median.price <= 599 ] = 5
dummy.median [ median.price >= 600] = 6
})
head(zipcode.price2)
In [71]:
## zipcode.price2로부터 zipcode와 dummy.median을 추출한 mg1 데이터를 만들어 주고
mg1 = zipcode.price2 %>%
select(zipcode, dummy.median)
head(mg1)
##analysis와 mg1를 zipcode(PK)기준으로 left outer join
analysis3 = merge(analysis, mg1, by = "zipcode", all.x=TRUE)
analysis3$dummy.median = as.factor(analysis3$dummy.median) # dummy.median을 factor로 변환
In [47]:
## 구글맵에 시각화
KingCounty3 = ggmap(KingCounty) + geom_point(data = analysis3, aes(x = long, y = lat, colour = dummy.median))
KingCounty3
In [51]:
result = analysis3 %>%
select(price, bedrooms, bathrooms, grade, sqft_living, dummy.median)
In [53]:
## price와 sqft_living 은 숫자 단위가 크므로 log를 취함
model.1 = lm(log(price) ~ bedrooms + bathrooms + grade + log(sqft_living) + factor(dummy.median), data = result)
summary(model.1)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.671678 0.051588 148.711 <2e-16 ***
bedrooms -0.024589 0.002639 -9.318 <2e-16 ***
bathrooms 0.010022 0.004001 2.505 0.0123 *
grade 0.113577 0.002642 42.995 <2e-16 ***
log(sqft_living) 0.565491 0.008952 63.167 <2e-16 ***
factor(dummy.median)2 0.351462 0.004677 75.147 <2e-16 ***
factor(dummy.median)3 0.597608 0.004919 121.500 <2e-16 ***
factor(dummy.median)4 0.851582 0.009680 87.975 <2e-16 ***
factor(dummy.median)5 1.169356 0.039862 29.335 <2e-16 ***
In [56]:
lm.beta::lm.beta(model.1)
In [61]:
model.2 = lm(log(price) ~ grade + log(sqft_living) + factor(dummy.median), data = result)
summary(model.2)
In [64]:
vif(model.2)
In [68]:
lm.beta::lm.beta(model.2)
In [81]:
final = lm(log(price) ~ grade + log(sqft_living) + factor(dummy.median), data = result)
In [82]:
## test데이터를 불러와 log.price변수를 추가
test = merge(test, mg1, by = "zipcode", all.x=TRUE)
test$log.price = log(test$price)
head(test)
In [84]:
## 생성된 모델을 바탕으로 test를 예측
pred = predict(final,
newdata = test,
interval = "predict")
pred = as.data.table(pred)
In [86]:
## pred는 test를 얼마나 예측 했을까?
pred.final = pred
test.final = test %>%
select(price, log.price)
## test의 log.price가 예측 범위 내에 얼마나 존재하는지 확인
## lwr(예측값) <= log.price(실제값) <= upr(예측값)
pred.final = cbind(test.final, pred.final)
pred.final$result = ifelse( (pred.final$lwr <= pred.final$log.price) & (pred.final$log.price <= pred.final$upr) ,1 ,0)
table(pred.final$result)
In [88]:
(6172 / (6172+311)) * 100